Feature Engineering:
Assessments:
# Split assessments into final exams and continuous (non-exam) assessments,
# then count how many non-exam assessments each module/presentation offers.
exams <- assessments[assessments$assessment_type == "Exam", ] # rows of type "Exam"
others <- assessments[assessments$assessment_type != "Exam", ] # rows of all other assessment types
amounts <- table(others$code_module, others$code_presentation)
amounts
##
## 2013B 2013J 2014B 2014J
## AAA 0 5 0 5
## BBB 11 11 11 5
## CCC 0 0 8 8
## DDD 13 6 6 6
## EEE 0 4 4 4
## FFF 0 12 12 12
## GGG 0 9 9 9
# Convert the contingency table to a long data frame so it can be merged later
amounts_df <- as.data.frame(amounts)
amounts_df
names(amounts_df) <- c("code_module", "code_presentation", "count")
head(amounts_df)
# `amounts_df` now holds the total number of non-exam assessments per
# module/presentation — the denominator for the pass-rate feature computed later.
# Determine whether a score passes the 40-mark threshold.
#
# Vectorized replacement for the original scalar if/else ladder:
# `grade >= 40` already yields TRUE for a pass, FALSE for a fail, and
# propagates NA for a missing score, so the explicit branches are
# unnecessary. Scalar callers (e.g. sapply(score, pass_fail)) get
# exactly the original behavior; whole vectors now also work directly.
#
# @param grade Numeric score (or vector of scores); may contain NA.
# @return Logical of the same length as `grade`: TRUE = pass (>= 40),
#   FALSE = fail, NA where the score is missing.
pass_fail <- function(grade) {
  grade >= 40
}
# Build a per-submission frame that joins each student's assessment record
# with the assessment metadata (weight, module, presentation).
# all.x = TRUE keeps every submission even when its assessment is not in `others`.
modifiedStudentAssessment <- merge(studentAssessment, others, by = "id_assessment", all.x = TRUE)
modifiedStudentAssessment
# Flag each submission as pass/fail against the 40-mark threshold.
modifiedStudentAssessment <- modifiedStudentAssessment %>%
  mutate(pass = sapply(score, pass_fail))
modifiedStudentAssessment
# Weight-adjusted score contribution of each submission.
modifiedStudentAssessment <- modifiedStudentAssessment %>%
  mutate(weighted_grade = score * weight / 100)
modifiedStudentAssessment
#Final assessment average per student per module
# NOTE(review): despite the name, this is the SUM of weight-adjusted scores
# (score * weight / 100). It equals a weighted average only if the non-exam
# weights sum to 100 within each module/presentation — confirm against the data.
avg_grade <- modifiedStudentAssessment %>%
group_by(id_student, code_module, code_presentation) %>%
summarize(total_weighted_grade = sum(weighted_grade, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'id_student', 'code_module'. You can
## override using the `.groups` argument.
avg_grade
head(avg_grade)
# Per-student pass rate: number of passed non-exam assessments divided by the
# number of non-exam assessments offered in that module/presentation
# (denominator comes from amounts_df).
# NOTE(review): students who passed nothing have no row here at all — they are
# absent rather than having pass_rate = 0, and are later dropped by inner joins.
passed_only <- subset(modifiedStudentAssessment, pass == TRUE)
passed_only
# One row per (student, module, presentation) with the count of passes.
passes_per_student <- aggregate(pass ~ id_student + code_module + code_presentation,
                                data = passed_only, FUN = length)
passes_per_student
# Attach the total assessment count and compute the ratio.
pass_rate <- merge(passes_per_student, amounts_df,
                   by = c("code_module", "code_presentation"), all.x = TRUE)
pass_rate
pass_rate$pass_rate <- pass_rate$pass / pass_rate$count
pass_rate
# Keep only the key columns plus the derived rate.
pass_rate <- pass_rate[, c("id_student", "code_module", "code_presentation", "pass_rate")]
head(pass_rate)
pass_rate
# Final exam scores
# Inner join (all.x = FALSE) keeps only submissions that are exams; transform()
# adds exam_score as a copy of the submission score.
stud_exams <- transform(merge(modifiedStudentAssessment, exams, by = "id_assessment", all.x = FALSE), exam_score = score)
stud_exams
# Drop columns by position so that four columns remain (they are renamed to
# id_student, code_module, code_presentation, exam_score further below).
# NOTE(review): positional drops are fragile — they silently break if the
# merged frame's column order ever changes; verify indices against names().
columns_to_drop_indices <- c(1, 3, 4, 6:12, 15:20)
stud_exams <- stud_exams[, -columns_to_drop_indices]
stud_exams
# Re-derive exam_score from the surviving score column, then drop that score
# column (position 2).
# NOTE(review): this repeats the exam_score assignment already done in
# transform() above — presumably because the first copy was dropped by index.
stud_exams$exam_score = stud_exams$score
stud_exams <- stud_exams[, -c(2)]
head(stud_exams)
stud_exams
VLE
vle
# VLE materials with a scheduled week (week_from present).
# NOTE(review): vle_filtered is computed but never used downstream.
vle_filtered <- subset(vle, !is.na(week_from))
vle_filtered
studentVle
# Average click activity: first per student per site, then averaged again per
# student per module/presentation.
# `summarise(across(everything(), mean))` replaces the superseded
# summarize_all(mean) (same result: mean of every non-grouping column).
avg_per_site <- studentVle %>%
  group_by(id_student, id_site, code_module, code_presentation) %>%
  summarize(across(everything(), mean)) %>%
  ungroup()
avg_per_site
avg_per_student <- avg_per_site %>%
  group_by(id_student, code_module, code_presentation) %>%
  summarize(date = mean(date), sum_click = mean(sum_click), .groups = "drop")
avg_per_student
StudentInfo
# Keep only students who did not withdraw, and only the columns needed for
# the modeling frame (keys, prior attempts, and the target final_result).
studInfo <- studentInfo %>%
  filter(final_result != "Withdrawn") %>%
  select(code_module, code_presentation, id_student,
         num_of_prev_attempts, final_result)
studInfo
Compiling all relevant tables
# Assemble the modeling frame by inner-joining every per-student table on the
# shared (student, module, presentation) key. Inner joins drop any student
# missing from one of the sources (e.g. no passed assessments, no exam score).
student_key <- c("id_student", "code_module", "code_presentation")
df_1 <- inner_join(avg_grade, pass_rate, by = student_key)
names(stud_exams) <- c("id_student", "code_module", "code_presentation", "exam_score")
assessment_info <- inner_join(df_1, stud_exams, by = student_key)
assessment_info
df_2 <- inner_join(studInfo, assessment_info, by = student_key)
final_df <- inner_join(df_2, avg_per_student, by = student_key)
final_df
# The identifier columns have served their purpose; keep only model features.
final_df <- select(final_df, -id_student, -code_module, -code_presentation)
final_df
EDA (Exploratory Data Analysis)
# Summary statistics and structure of the modeling frame.
library(psych)
describe(final_df)
str(final_df)
## 'data.frame': 4950 obs. of 7 variables:
## $ num_of_prev_attempts: int 0 0 0 0 0 0 0 0 0 0 ...
## $ final_result : chr "Distinction" "Pass" "Pass" "Pass" ...
## $ total_weighted_grade: num 89.6 84.6 51.4 75.1 93.2 ...
## $ pass_rate : num 1 1 0.625 1 1 1 0.5 1 1 0.875 ...
## $ exam_score : int 94 76 66 50 98 100 68 84 90 66 ...
## $ date : num 103.5 87.6 49.2 118.7 75.8 ...
## $ sum_click : num 2.71 1.54 1.56 2.19 2.1 ...
glimpse(final_df)
## Rows: 4,950
## Columns: 7
## $ num_of_prev_attempts <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ final_result <chr> "Distinction", "Pass", "Pass", "Pass", "Distincti…
## $ total_weighted_grade <dbl> 89.65, 84.58, 51.44, 75.13, 93.22, 91.41, 18.23, …
## $ pass_rate <dbl> 1.000, 1.000, 0.625, 1.000, 1.000, 1.000, 0.500, …
## $ exam_score <int> 94, 76, 66, 50, 98, 100, 68, 84, 90, 66, 100, 82,…
## $ date <dbl> 103.45791, 87.61726, 49.18129, 118.69864, 75.7919…
## $ sum_click <dbl> 2.706754, 1.539047, 1.562619, 2.189217, 2.100617,…
# Numeric-only view (drop the categorical target) for correlation analysis.
# NOTE(review): `-which(names(...) %in% ...)` selects ZERO columns when there
# is no match (negative empty index); safe here because final_result exists.
quant_final_df <- final_df[, -which(names(final_df) %in% c("final_result"))]
quant_final_df
# Candidate diverging color palettes for the correlation plot.
# NOTE(review): only col1 is actually used below; col2, col3, col4 and wb are
# unused alternatives kept for experimentation.
col1 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "white", "cyan",
"#007FFF", "blue", "#00007F"))
col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
"#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))
col3 <- colorRampPalette(c("red", "white", "blue"))
col4 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "#7FFF7F",
"cyan", "#007FFF", "blue", "#00007F"))
wb <- c("white", "black")
library(corrplot)
## corrplot 0.92 loaded
# Pearson correlations between all numeric features.
cor_matrix <- cor(quant_final_df)
cor_matrix
## num_of_prev_attempts total_weighted_grade pass_rate
## num_of_prev_attempts 1.00000000 -0.06306455 -0.03817519
## total_weighted_grade -0.06306455 1.00000000 0.87427114
## pass_rate -0.03817519 0.87427114 1.00000000
## exam_score -0.10428765 0.44905358 0.27791733
## date 0.11032714 0.02119136 0.02755571
## sum_click -0.07787152 -0.17123406 -0.19486350
## exam_score date sum_click
## num_of_prev_attempts -0.1042876471 0.11032714 -0.0778715206
## total_weighted_grade 0.4490535767 0.02119136 -0.1712340590
## pass_rate 0.2779173251 0.02755571 -0.1948635039
## exam_score 1.0000000000 0.09634348 -0.0001294788
## date 0.0963434802 1.00000000 0.2148798489
## sum_click -0.0001294788 0.21487985 1.0000000000
summary(cor_matrix)
## num_of_prev_attempts total_weighted_grade pass_rate
## Min. :-0.10429 Min. :-0.1712 Min. :-0.19486
## 1st Qu.:-0.07417 1st Qu.:-0.0420 1st Qu.:-0.02174
## Median :-0.05062 Median : 0.2351 Median : 0.15274
## Mean : 0.13782 Mean : 0.3517 Mean : 0.32445
## 3rd Qu.: 0.07320 3rd Qu.: 0.7680 3rd Qu.: 0.72518
## Max. : 1.00000 Max. : 1.0000 Max. : 1.00000
## exam_score date sum_click
## Min. :-0.10429 Min. :0.02119 Min. :-0.1949
## 1st Qu.: 0.02399 1st Qu.:0.04475 1st Qu.:-0.1479
## Median : 0.18713 Median :0.10334 Median :-0.0390
## Mean : 0.28648 Mean :0.24505 Mean : 0.1285
## 3rd Qu.: 0.40627 3rd Qu.:0.18874 3rd Qu.: 0.1611
## Max. : 1.00000 Max. :1.00000 Max. : 1.0000
# Visualize the correlation matrix (circles + numeric coefficients).
# Note the strong total_weighted_grade / pass_rate correlation (0.874),
# which motivates the reduced feature sets used in modeling below.
par(mar = c(2,2,2,2))
corrplot(cor_matrix, method = "circle", type = "full", addrect = 2, col = col1(100), tl.col = "black", tl.srt = 45, addCoef.col = "black", cl.ratio = 0.4, cl.align = "r")

library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Class balance of the target. "Withdrawn" students were filtered out earlier,
# so that fill mapping is effectively unused here.
ggplot(final_df, aes(x = final_result, fill = final_result)) + geom_bar() + labs(title = "Count Plot of Final Results", x = "Final Result", y = "Count") + theme_minimal() + scale_fill_manual(values = c("Pass" = "green", "Fail" = "red", "Distinction" = "blue", "Withdrawn" = "gray"))

library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatterplots, densities, and correlations for all numeric features.
ggpairs(quant_final_df)

# Inspect the outlier candidates before filtering them out.
subset(final_df, sum_click > 10)
subset(final_df, num_of_prev_attempts > 4)
# BUG FIX: the original assigned filtered_df twice, each time starting from
# final_df, so the second assignment silently discarded the sum_click filter.
# Apply both outlier conditions together.
filtered_df <- subset(final_df, sum_click <= 10 & num_of_prev_attempts <= 4)
filtered_df
Modeling
# Load the required libraries
library(caret)
## Loading required package: lattice
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Split the data into training and testing sets (70/30, stratified on final_result)
set.seed(123) # Set a seed for reproducibility
splitIndex <- createDataPartition(final_df$final_result, p = 0.7, list = FALSE)
X_train <- final_df[splitIndex, !names(final_df) %in% "final_result"]
y_train <- final_df$final_result[splitIndex]
X_test <- final_df[-splitIndex, !names(final_df) %in% "final_result"]
y_test <- final_df$final_result[-splitIndex]
# Create min-max ("range") scalers: scaler1 on all features; scaler2 and
# scaler3 each drop one of the two highly correlated grade features.
# BUG FIX: the column is named "total_weighted_grade" (see str(final_df)
# earlier), not "weighted_grade". The original exclusion matched nothing,
# so dataset 2 was silently identical to dataset 1 — which is why the
# LDA results for datasets 1 and 2 came out the same.
scaler1 <- preProcess(X_train, method = c("range"))
scaler2 <- preProcess(X_train[, !names(X_train) %in% "total_weighted_grade"], method = c("range"))
scaler3 <- preProcess(X_train[, !names(X_train) %in% "pass_rate"], method = c("range"))
# Apply the scalers to the datasets
X1_train <- predict(scaler1, X_train)
X1_test <- predict(scaler1, X_test)
X2_train <- predict(scaler2, X_train[, !names(X_train) %in% "total_weighted_grade"])
X2_test <- predict(scaler2, X_test[, !names(X_test) %in% "total_weighted_grade"])
X3_train <- predict(scaler3, X_train[, !names(X_train) %in% "pass_rate"])
X3_test <- predict(scaler3, X_test[, !names(X_test) %in% "pass_rate"])
Linear Discriminant Analysis
# Load the required library
library(MASS) # This package contains Linear Discriminant Analysis
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit LDA on the training data (all features, min-max scaled)
lda1 <- lda(y_train ~ ., data = data.frame(X1_train, y_train))
# Transform the test data
# NOTE(review): X1_test_lda is never used — the identical predict() call is
# repeated on the next code line to produce result_lda1.
X1_test_lda <- predict(lda1, newdata = data.frame(X1_test))
# Predict on the transformed test data
result_lda1 <- predict(lda1, newdata = data.frame(X1_test))
# Calculate and print confusion matrix
confusion_matrix <- table(Actual = y_test, Predicted = result_lda1$class)
print(confusion_matrix)
## Predicted
## Actual Distinction Fail Pass
## Distinction 231 0 33
## Fail 0 151 49
## Pass 49 48 923
cat("\n")
# Calculate and print additional metrics
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
# NOTE(review): [2, 2] indexes only the second class ("Fail"), so the
# precision/recall/F1 printed below are for the Fail class, not macro averages.
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.8793801
cat("Precision: ", precision, "\n")
## Precision: 0.758794
cat("Recall: ", recall, "\n")
## Recall: 0.755
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7568922
# Fit LDA on the training data for dataset 2 (intended: without total_weighted_grade)
# NOTE(review): the confusion matrix and all metrics below are identical to
# dataset 1. That is consistent with the scaler setup filtering on the name
# "weighted_grade" while the column is "total_weighted_grade", leaving X2
# equal to X1 — confirm after fixing the column name.
lda2 <- lda(y_train ~ ., data = data.frame(X2_train, y_train))
# Transform the test data for dataset 2
# NOTE(review): X2_test_lda is never used — the same predict() is repeated below.
X2_test_lda <- predict(lda2, newdata = data.frame(X2_test))
# Predict on the transformed test data for dataset 2
result_lda2 <- predict(lda2, newdata = data.frame(X2_test))
# Calculate and print confusion matrix for dataset 2
confusion_matrix <- table(Actual = y_test, Predicted = result_lda2$class)
print(confusion_matrix)
## Predicted
## Actual Distinction Fail Pass
## Distinction 231 0 33
## Fail 0 151 49
## Pass 49 48 923
cat("\n")
# Calculate and print additional metrics for dataset 2
# (as above, precision/recall/F1 are for the "Fail" class only)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.8793801
cat("Precision: ", precision, "\n")
## Precision: 0.758794
cat("Recall: ", recall, "\n")
## Recall: 0.755
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7568922
# Fit LDA on the training data for dataset 3 (features without pass_rate)
lda3 <- lda(y_train ~ ., data = data.frame(X3_train, y_train))
# Transform the test data for dataset 3
# NOTE(review): X3_test_lda is never used — the same predict() is repeated below.
X3_test_lda <- predict(lda3, newdata = data.frame(X3_test))
# Predict on the transformed test data for dataset 3
result_lda3 <- predict(lda3, newdata = data.frame(X3_test))
# Calculate and print confusion matrix for dataset 3
confusion_matrix <- table(Actual = y_test, Predicted = result_lda3$class)
print(confusion_matrix)
## Predicted
## Actual Distinction Fail Pass
## Distinction 234 0 30
## Fail 0 160 40
## Pass 52 45 923
cat("\n")
# Calculate and print additional metrics for dataset 3
# (precision/recall/F1 are for the "Fail" class only — see note on dataset 1)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.8874663
cat("Precision: ", precision, "\n")
## Precision: 0.7804878
cat("Recall: ", recall, "\n")
## Recall: 0.8
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7901235